# Read data-processed/metaviz_long.rds; here::here() builds the path so the
# load does not depend on the current working directory.
metaviz_long <- rio::import(
  file = here::here("data-processed", "metaviz_long.rds")
)
First, you can see the number of available observations for each variable in each year.
empty plot
# Step 1: set up the plot canvas only (no geoms yet).
# Rows without a value are dropped, then the variable labels are turned into
# a factor sorted by the numeric version of their category.
metaviz_long %>%
  drop_na(value) %>%
  mutate(
    key_name_label = factor(key_name_label),
    # helper column used only for sorting the labels
    order = as.numeric(key_category),
    key_name_label = fct_reorder(key_name_label, desc(order))
  ) %>%
  ggplot(
    mapping = aes(x = key_name_label, y = syear, colour = key_category)
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## mutate: converted 'key_name_label' from character to factor (0 new NA)
## new variable 'order' (double) with 5 unique values and 0% NA
with data points
# Step 2: same canvas as before, now with a count layer so that each
# label/year combination is drawn as a point sized by its number of rows.
metaviz_long %>%
  drop_na(value) %>%
  mutate(
    key_name_label = factor(key_name_label),
    # helper column used only for sorting the labels
    order = as.numeric(key_category),
    key_name_label = fct_reorder(key_name_label, desc(order))
  ) %>%
  ggplot(
    mapping = aes(x = key_name_label, y = syear, colour = key_category)
  ) +
  geom_count()
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## mutate: converted 'key_name_label' from character to factor (0 new NA)
## new variable 'order' (double) with 5 unique values and 0% NA
flip the axis
# Step 3: swap the axes so the variable labels run down the vertical axis
# and the survey years along the horizontal one.
metaviz_long %>%
  drop_na(value) %>%
  mutate(
    key_name_label = factor(key_name_label),
    # helper column used only for sorting the labels
    order = as.numeric(key_category),
    key_name_label = fct_reorder(key_name_label, desc(order))
  ) %>%
  ggplot(
    mapping = aes(x = key_name_label, y = syear, colour = key_category)
  ) +
  geom_count() +
  coord_flip()
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## mutate: converted 'key_name_label' from character to factor (0 new NA)
## new variable 'order' (double) with 5 unique values and 0% NA
final plot
# Step 4: the finished overview plot with wrapped labels, a fixed year axis,
# a single-column colour legend and titles.
metaviz_long %>%
  drop_na(value) %>%
  mutate(
    key_name_label = factor(key_name_label),
    # helper column used only for sorting the labels
    order = as.numeric(key_category),
    key_name_label = fct_reorder(key_name_label, desc(order))
  ) %>%
  ggplot(
    mapping = aes(x = key_name_label, y = syear, colour = key_category)
  ) +
  geom_count() +
  coord_flip() +
  # wrap long variable labels at 40 characters
  scale_x_discrete(labels = wrap_format(40)) +
  # show 1998-2018 with a tick every second year
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(from = 1998, to = 2018, by = 2)
  ) +
  guides(colour = guide_legend(ncol = 1)) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    x = "",
    y = ""
  ) +
  theme(
    legend.position = "right",
    # align the title with the whole plot rather than the panel
    plot.title.position = "plot"
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## mutate: converted 'key_name_label' from character to factor (0 new NA)
## new variable 'order' (double) with 5 unique values and 0% NA
Here is an overall plot of the number of available observations for each of the variables. It helps to get a general understanding of the proportion of missing values for groups of variables.
# Overall number of non-missing observations per variable, drawn as one
# horizontal bar chart per category ("Psych. Measure" is excluded here).
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category != "Psych. Measure") %>%
  # attach the number of remaining rows per variable as column `n` ...
  group_by(key) %>%
  add_count() %>%
  ungroup() %>%
  # ... and keep a single row per variable. Spell out TRUE: the shorthand `T`
  # is an ordinary variable that can be reassigned.
  distinct(key, .keep_all = TRUE) %>%
  # reorder the labels by their count separately within each category
  group_by(key_category) %>%
  mutate(key_name_label = fct_reorder(factor(key_name_label), n)) %>%
  ggplot(aes(x = key_name_label, y = n, fill = key_category, label = n)) +
  geom_col(width = 0.2) +
  geom_point() +
  geom_label(color = "white", size = 2) +
  coord_flip() +
  scale_y_continuous(labels = scales::label_number_auto()) +
  # wrap long variable labels at 40 characters
  scale_x_discrete(labels = wrap_format(40)) +
  theme_light() +
  theme(legend.position = "none") +
  # one panel per category, each with its own axes
  facet_wrap(~key_category, ncol = 1, scales = "free") +
  labs(
    title = "Overall Number of observations for selected SOEP variables from 1998 - 2018",
    y = "",
    x = ""
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 66,568 rows (14%), 409,994 rows remaining
## group_by: one grouping variable (key)
## add_count (grouped): new variable 'n' (integer) with 30 unique values and 0% NA
## ungroup: no grouping variables
## distinct: removed 409,964 rows (>99%), 30 rows remaining
## group_by: one grouping variable (key_category)
## mutate (grouped): converted 'key_name_label' from character to factor (0 new NA)
## By Variable Category {.tabset}
# Tab "ID's": observation counts per variable and survey year for this
# category only, drawn in a fixed colour.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "ID's") %>%
  ggplot(mapping = aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#440154FF") +
  coord_flip() +
  # wrap long variable labels at 40 characters
  scale_x_discrete(labels = wrap_format(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(from = 1998, to = 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    x = "",
    y = ""
  ) +
  theme(legend.position = "right")
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 423,393 rows (89%), 53,169 rows remaining
# Tab "Survey": observation counts per variable and survey year for this
# category only, drawn in a fixed colour.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "Survey") %>%
  ggplot(mapping = aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#3B528BFF") +
  coord_flip() +
  # wrap long variable labels at 40 characters
  scale_x_discrete(labels = wrap_format(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(from = 1998, to = 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    x = "",
    y = ""
  ) +
  theme(legend.position = "right")
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 381,757 rows (80%), 94,805 rows remaining
# Tab "Demography": observation counts per variable and survey year for this
# category only, drawn in a fixed colour.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "Demography") %>%
  ggplot(mapping = aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#21908CFF") +
  coord_flip() +
  # wrap long variable labels at 40 characters
  scale_x_discrete(labels = wrap_format(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(from = 1998, to = 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    x = "",
    y = ""
  ) +
  theme(legend.position = "right")
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 280,369 rows (59%), 196,193 rows remaining
# Tab "Psych. Measure": observation counts per variable and survey year for
# this category only, drawn in a fixed colour.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "Psych. Measure") %>%
  ggplot(mapping = aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#5DC863FF") +
  coord_flip() +
  # wrap long variable labels at 40 characters
  scale_x_discrete(labels = wrap_format(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(from = 1998, to = 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    x = "",
    y = ""
  ) +
  theme(legend.position = "right")
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 409,994 rows (86%), 66,568 rows remaining
# Tab "Other": observation counts per variable and survey year for this
# category only, drawn in a fixed colour.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "Other") %>%
  ggplot(mapping = aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#FDE725FF") +
  coord_flip() +
  # wrap long variable labels at 40 characters
  scale_x_discrete(labels = wrap_format(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(from = 1998, to = 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    x = "",
    y = ""
  ) +
  theme(legend.position = "right")
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 410,735 rows (86%), 65,827 rows remaining